
* Start from a clean session: drop data in memory, Mata objects and matrices.
clear all
clear mata
clear matrix
* Do not pause output with --more-- while the script runs.
set more off
* NOTE(review): "modern" does not look like a built-in graph scheme; presumably
* it has to be installed before this line will run - confirm.
set scheme modern

* Set paths here: fill in each directory before running the script.
* Only $workingdata and $match are referenced in this part of the script
* ($workingdata for the input/output datasets, $match for the files merged in).

global rawdata = 
global workingdata = 
global match = 
global temp = 
global table = 

*****************************
*regression data preparation*
*****************************

* Load the working sample and attach county-level data to it.
* Paths are quoted and use forward slashes so they work with directories that
* contain spaces and on every operating system Stata runs on.
use "${workingdata}/workingdata.dta", clear

*merge air pollution*
* unmatched(none): keep only observations that have a match in API.dta

joinby countyid day using "${match}/API.dta", unmatched(none)

*merge weather*
* unmatched(none): keep only observations that have a match in weather.dta

joinby countyid day using "${match}/weather.dta", unmatched(none)

* unmatched(master): keep every observation currently in memory, matched or not;
* the _merge indicator created by joinby is not needed afterwards
joinby countyid day using "${match}/weather_cumul.dta", unmatched(master)
drop _merge

joinby countyid day using "${match}/cumulday.dta", unmatched(master)
drop _merge

*merge temperature group indicator*
* matched on countyid only (one group per county, not per day)

joinby countyid using "${match}/temp_group.dta", unmatched(master)
drop _merge
* recode group 2 to 0
replace temp_group=0 if temp_group==2

*generate variables*

* Temperature-bin dummies: bin 1 is temp<=12, bins 2-11 are the 2-degree
* intervals (12,14], (14,16], ..., (30,32], bin 12 is temp>32.
* Each dummy is missing when temp is missing.
gen tempbin1=(temp<=12) if temp!=.
forvalues b=2/11 {
	local lo=8+2*`b'
	local hi=`lo'+2
	gen tempbin`b'=(temp>`lo'&temp<=`hi') if temp!=.
}
gen tempbin12=(temp>32) if temp!=.

* Squared weather controls: precipitation, wind speed, sunshine duration,
* relative humidity.
foreach w in pre win ssd rhu {
	gen `w'2=`w'^2
}

* Time trend: 1, 2, 3 for survey years 2010, 2014, 2018; missing otherwise.
gen t=cond(year==2010,1,cond(year==2014,2,cond(year==2018,3,.)))

* Hour in which the interview ended (first two characters of ts_endiw).
gen endhour=real(substr(ts_endiw,1,2))

* Interview duration in minutes (end minus begin, both converted from the
* hour in characters 1-2 and the minute in characters 4-5); only defined
* when the interview started and ended on the same date.
local end_min (real(substr(ts_endiw,1,2))*60+real(substr(ts_endiw,4,2)))
local begin_min (real(substr(ts_beginiw,1,2))*60+real(substr(ts_beginiw,4,2)))
gen iwtime=`end_min'-`begin_min' if iwstartdate==iwenddate

gen lnhincome_per=log(hincome_per)  //log household per capita income
gen age2=age^2  //age square
gen month=month(day)  //calendar month of the interview day
gen week=dow(day)  //day of week of the interview day

gen logwordtest=ln(wordtest+1)  //log word test scores
gen logmathtest=ln(mathtest+1)  //log math test scores

*adjust inconsistent data (gender and education)*

* Gender: replace every observation of a person with the rounded
* within-person mean of gender.
* NOTE(review): presumably gender is coded 0/1 so that this picks the value
* reported most often - confirm the coding.
egen gender_adj=mean(gender), by(pid)
replace gender_adj=round(gender_adj)
replace gender=gender_adj
drop gender_adj  //adjust gender

* Build a within-person observation counter a (1, 2, ...) and declare it as
* the panel time variable so that L1. refers to the person's previous
* observation.
* NOTE(review): a follows year only if the data are still ordered by
* pid year when bysort runs - confirm.
sort pid year
bysort pid: gen a=_n
xtset pid a
* Years of education: gap is the change from the previous observation;
* where it is negative, take the previous observation's (adjusted) value.
gen gap=eduy-L1.eduy
gen eduy_adj=eduy
replace eduy_adj=L1.eduy_adj if gap<0&gap!=.
replace eduy=eduy_adj
drop eduy_adj  //adjust eduy

* edulist: same rule as for eduy. clonevar keeps the storage type, labels
* and format of edulist on the working copy.
gen gap2=edulist-L1.edulist
clonevar edulist_adj=edulist
replace edulist_adj=L1.edulist_adj if gap2<0
replace edulist=edulist_adj
drop edulist_adj  //adjust edulist

*generate countyid in the first wave*
* countyid10 is the county of a person's first observation (a==1), copied
* from the previous observation for a>1.
* NOTE(review): a==1 is the first observation in the data, which is the first
* wave only if the person was observed in it - confirm.

gen countyid10=countyid if a==1
replace countyid10=L1.countyid10 if a>1

drop a gap gap2

*generate group indicators*

*age_group*
* Based on the rounded within-person mean age:
* 1 = 30 or younger, 2 = 31 to 59, 3 = 60 or older.
* Stata treats missing as larger than any number, so the top group needs an
* explicit !missing() guard; age_group stays missing when age_avg is missing.
egen age_avg=mean(age), by(pid)
replace age_avg=round(age_avg)
gen age_group=.
replace age_group=1 if age_avg<=30
replace age_group=2 if age_avg>30&age_avg<60
replace age_group=3 if age_avg>=60&!missing(age_avg)

*edu_group*
* 1 = at most 12 years of education, 2 = more than 12 years.
* Same guard as above: edu_group stays missing when eduy is missing.
gen edu_group=.
replace edu_group=1 if eduy<=12
replace edu_group=2 if eduy>12&!missing(eduy)

*non-migrant*
* 1 if the person is observed in a single county across all observations.
bysort pid: egen maxcountyid=max(countyid)
bysort pid: egen mincountyid=min(countyid)
gen non_migrant=(maxcountyid==mincountyid)
drop maxcountyid mincountyid

*non-school*
* 1 if the person's edulist never changes across observations.
* edulist already holds the adjusted values; the helper variable edulist_adj
* was dropped earlier in the script and no longer exists at this point.
bysort pid: egen maxedulist=max(edulist)
bysort pid: egen minedulist=min(edulist)
gen non_school=(maxedulist==minedulist)
drop maxedulist minedulist

*label variables*
* Build the twelve bin labels once, then attach them to the tempbin
* dummies and to the tempbin*_sum_30 variables.
local lbl1 "<12 C"
forvalues k=2/11 {
	local lo=8+2*`k'
	local hi=`lo'+2
	local lbl`k' "`lo'-`hi' C"
}
local lbl12 ">32 C"

forvalues k=1/12 {
	label variable tempbin`k' "`lbl`k''"
}

forvalues k=1/12 {
	label variable tempbin`k'_sum_30 "`lbl`k''"
}

*set panel data*
* Declare the panel structure: person identifier pid, time variable year.
xtset pid year

* Write the regression dataset. The path is quoted and uses a forward slash
* so it works with directories containing spaces and on any operating system.
save "${workingdata}/regdata.dta", replace
